%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
file_1 = r"shhs1-dataset-0.14.0.csv"
file_2 = r"shhs2-dataset-0.14.0.csv"
file_3 = r"shhs-interim-followup-dataset-0.14.0.csv"
shhs1 = pd.read_csv(file_1, engine='python')
shhs2 = pd.read_csv(file_2, engine='python')
shhs_inter = pd.read_csv(file_3, engine='python')
shhs = shhs1.merge(shhs2, on="nsrrid")
shhs = shhs.merge(shhs_inter, on="nsrrid")
shhs.columns = map(str.lower, shhs.columns)
shhs
shhs.ess_s1.hist(bins = 24)
shhs.ess_interim.hist(bins = 24)
shhs.ess_s2.hist(bins = 24)
import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.stats
size = 30000
x = np.arange(np.nan_to_num(shhs.ess_s2).shape[0])
y = shhs.ess_s1 #scipy.int_(np.round_(scipy.stats.vonmises.rvs(5,size=size)*47))
y = y[np.logical_not(np.isnan(y))]
h = plt.hist(y, bins=range(48))
dist_names = ['gamma', 'beta', 'rayleigh', 'norm', 'pareto', 'loggamma']
for dist_name in dist_names:
dist = getattr(scipy.stats, dist_name)
params = dist.fit(y)
print(dist_name)
print(params)
arg = params[:-2]
loc = params[-2]
scale = params[-1]
if arg:
pdf_fitted = dist.pdf(x, *arg, loc=loc, scale=scale) * size
else:
pdf_fitted = dist.pdf(x, loc=loc, scale=scale) * size
plt.plot(pdf_fitted, label=dist_name)
plt.xlim(0,47)
plt.legend(loc='upper right')
plt.show()
from scipy.stats import beta
#mean, var, skew, kurt =beta.stats(2.7453967765987652, 8.855531017668572, moments='mvsk')
import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.stats
size = 30000
x = np.arange(np.nan_to_num(shhs.ess_interim).shape[0])
y = shhs.ess_interim
y = y[np.logical_not(np.isnan(y))] #scipy.int_(np.round_(scipy.stats.vonmises.rvs(5,size=size)*47))
h = plt.hist(y, bins=range(48))
dist_names = ['gamma', 'rayleigh', 'norm', 'pareto', 'loggamma']
for dist_name in dist_names:
dist = getattr(scipy.stats, dist_name)
params = dist.fit(y)
print(dist_name)
print(params)
arg = params[:-2]
loc = params[-2]
scale = params[-1]
if arg:
pdf_fitted = dist.pdf(x, *arg, loc=loc, scale=scale) * size
else:
pdf_fitted = dist.pdf(x, loc=loc, scale=scale) * size
plt.plot(pdf_fitted, label=dist_name)
plt.xlim(0,47)
plt.legend(loc='upper right')
plt.show()
import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.stats
size = 30000
x = np.arange(np.nan_to_num(shhs.ess_s2).shape[0])
y = shhs.ess_s2 #scipy.int_(np.round_(scipy.stats.vonmises.rvs(5,size=size)*47))
y = y[np.logical_not(np.isnan(y))]
h = plt.hist(y, bins=range(48))
dist_names = ['gamma', 'beta', 'rayleigh', 'norm', 'pareto', 'loggamma']
for dist_name in dist_names:
dist = getattr(scipy.stats, dist_name)
params = dist.fit(y)
print(dist_name)
print(params)
arg = params[:-2]
loc = params[-2]
scale = params[-1]
if arg:
pdf_fitted = dist.pdf(x, *arg, loc=loc, scale=scale) * size
else:
pdf_fitted = dist.pdf(x, loc=loc, scale=scale) * size
plt.plot(pdf_fitted, label=dist_name)
plt.xlim(0,47)
plt.legend(loc='upper right')
plt.show()
from scipy import stats
#w, p =
stats.wilcoxon(shhs.ess_s1, shhs.ess_s2)
stats.wilcoxon(shhs.ess_interim, shhs.ess_s2)
stats.wilcoxon(shhs.ess_s1, shhs.ess_interim)
stats.ks_2samp(shhs.ess_s1, shhs.ess_s2)
stats.ks_2samp(shhs.ess_interim, shhs.ess_s1)
stats.ks_2samp(shhs.ess_interim, shhs.ess_s2)
Úgy tűnik nincs eltérés a két mérés eloszlása között.
import plotly.express as px
fig = px.density_heatmap(shhs, x="ess_s1", y="ess_s2")
fig.show()
fig = px.density_heatmap(shhs, x="ess_s2", y="ess_interim")
fig.show()
import plotly.express as px
fig = px.scatter_3d(shhs.loc[shhs.mdsa02>0,:], x="ess_s1", y="ess_s2", z="ess_interim", color = "mdsa02", opacity=0.3)
fig.show()
import numpy as np
np.nanmean(shhs.ess_s1.values-shhs.ess_s2.values)
np.nanmean(shhs.ess_s1.values-shhs.ess_interim.values)
np.nanmean(-shhs.ess_s2.values+shhs.ess_interim.values)
plt.hist(shhs.ess_s1.values-shhs.ess_s2.values)
import numpy as np
def concordance_correlation_coefficient(y_true, y_pred,
sample_weight=None,
multioutput='uniform_average'):
"""Concordance correlation coefficient.
The concordance correlation coefficient is a measure of inter-rater agreement.
It measures the deviation of the relationship between predicted and true values
from the 45 degree angle.
Read more: https://en.wikipedia.org/wiki/Concordance_correlation_coefficient
Original paper: Lawrence, I., and Kuei Lin. "A concordance correlation coefficient to evaluate reproducibility." Biometrics (1989): 255-268.
Parameters
----------
y_true : array-like of shape = (n_samples) or (n_samples, n_outputs)
Ground truth (correct) target values.
y_pred : array-like of shape = (n_samples) or (n_samples, n_outputs)
Estimated target values.
Returns
-------
loss : A float in the range [-1,1]. A value of 1 indicates perfect agreement
between the true and the predicted values.
Examples
--------
>>> from sklearn.metrics import concordance_correlation_coefficient
>>> y_true = [3, -0.5, 2, 7]
>>> y_pred = [2.5, 0.0, 2, 8]
>>> concordance_correlation_coefficient(y_true, y_pred)
0.97678916827853024
"""
cor=np.corrcoef(y_true,y_pred)[0][1]
mean_true=np.mean(y_true)
mean_pred=np.mean(y_pred)
var_true=np.var(y_true)
var_pred=np.var(y_pred)
sd_true=np.std(y_true)
sd_pred=np.std(y_pred)
numerator=2*cor*sd_true*sd_pred
denominator=var_true+var_pred+(mean_true-mean_pred)**2
return numerator/denominator
n_samples=1000
y_true = np.arange(n_samples)
y_pred = y_true + 5
plt.scatter(shhs.ess_s2, shhs.ess_interim)
concordance_correlation_coefficient(np.nan_to_num(shhs.ess_s1), np.nan_to_num(shhs.ess_s2))
concordance_correlation_coefficient(np.nan_to_num(shhs.ess_s1), np.nan_to_num(shhs.ess_interim))
concordance_correlation_coefficient(np.nan_to_num(shhs.ess_s2), np.nan_to_num(shhs.ess_interim))
A cikkben ez szerepelt: "The Rc value for Lin's concordance coefficient was 0.748 (the correlation is poor if the value is less than 0.9)."
shhs
shhs_interim_ess = shhs.loc[:,["sitread","watchtv","sitpublic","ridecar","resting","sittalk","sitafterlunch","stoppedcar","atdinner", "whiledriving"]]
shhs_interim_ess
shhs_1_ess = shhs.loc[:,["sitrd02","watv02","sitpub02","pgrcar02","lydwn02","sittlk02","sitlch02","incar02","attabl02","drive02"]]
shhs_1_ess
#SitRd02 WaTV02 SitPub02 PgrCar02 LyDwn02 SitTlk02 SitLch02 InCar02 AtTabl02 Drive02
#sh319a sh319b sh319c sh319d sh319e sh319f sh319g sh319h sh319i sh319j
#sitRead watchTV sitPublic rideCar resting sitTalk sitAfterLunch stoppedCar atDinner whileDriving bpmeds aspirin drive
#["sitrd02","watv02","sitpub02","pgrcar02","lydwn02","sittlk02","sitlch02","incar02","attabl02","drive02"]
#["sh319a","sh319b","sh319c","sh319d","sh319e","sh319f","sh319g","sh319h","sh319i","sh319j"]
#["sitread","watchtv","sitpublic","ridecar","resting","sittalk","sitafterLunch","stoppedcar","atdinner", "whiledriving"]
shhs_2_ess = shhs.loc[:,["sh319a","sh319b","sh319c","sh319d","sh319e","sh319f","sh319g","sh319h","sh319i","sh319j"]]
shhs_2_ess
plt.bar(["sitread","watchtv","sitpublic","ridecar","resting","sittalk","sitafterlunch","stoppedcar","atdinner", "whiledriving"],np.nanmean(shhs_2_ess.values-shhs_1_ess.values, axis = 0))
plt.xticks(rotation=45, ha='right')
plt.bar(["sitread","watchtv","sitpublic","ridecar","resting","sittalk","sitafterlunch","stoppedcar","atdinner", "whiledriving"],np.nanstd(shhs_2_ess.values-shhs_1_ess.values, axis = 0))
plt.xticks(rotation=45, ha='right')
plt.bar(["sitread","watchtv","sitpublic","ridecar","resting","sittalk","sitafterlunch","stoppedcar","atdinner", "whiledriving"],np.nanmean(shhs_2_ess.values-shhs_interim_ess.values, axis = 0))
plt.xticks(rotation=45, ha='right')
plt.bar(["sitread","watchtv","sitpublic","ridecar","resting","sittalk","sitafterlunch","stoppedcar","atdinner", "whiledriving"],np.nanstd(shhs_2_ess.values-shhs_interim_ess.values, axis = 0))
plt.xticks(rotation=45, ha='right')
plt.bar(["sitread","watchtv","sitpublic","ridecar","resting","sittalk","sitafterlunch","stoppedcar","atdinner", "whiledriving"],np.nanmean(shhs_1_ess.values-shhs_interim_ess.values, axis = 0))
plt.xticks(rotation=45, ha='right')
fig, axs = plt.subplots(10, figsize=(12, 23), sharey=True)
arr = ["sitread","watchtv","sitpublic","ridecar","resting","sittalk","sitafterlunch","stoppedcar","atdinner", "whiledriving"]
for i in range(10):
axs[i].hist(shhs_2_ess.values[:,i]-shhs_1_ess.values[:,i], bins = 7)
axs[i].set_title(arr[i])
plt.plot()
#plt.hist(shhs_2_ess.values[:,2]-shhs_1_ess.values[:,2], bins= 7)
import scipy.spatial as sp
cos_dist_m = 1-sp.distance.cdist(shhs_1_ess.values,shhs_interim_ess.values, 'cosine')
from scipy import spatial
cos_dist = []
euclidean_dist = []
mod_euclidean_dist = []
mink_dist = []
for i in range(shhs_1_ess.shape[0]):
cos_dist.append(1 - spatial.distance.cosine(shhs_1_ess.values[i,:],shhs_interim_ess.values[i,:]))
euclidean_dist.append(spatial.distance.euclidean(np.nan_to_num(shhs_1_ess.values[i,:]),np.nan_to_num(shhs_interim_ess.values[i,:])))
mod_euclidean_dist.append(spatial.distance.euclidean(np.nan_to_num(shhs_1_ess.values[i,:])**3,np.nan_to_num(shhs_interim_ess.values[i,:])**3))
mink_dist.append(spatial.distance.minkowski(np.nan_to_num(shhs_1_ess.values[i,:]),np.nan_to_num(shhs_interim_ess.values[i,:]),1))
print("Cos távolság:")
print(np.nanmean(cos_dist))
print("\nEuklideszi távolság:")
print(np.nanmean(euclidean_dist))
print("\nBence féle módosÃtott távolság:")
print(np.nanmean(mod_euclidean_dist))
print("\nMinkowski távolság:")
print(np.nanmean(mink_dist))
plt.plot(np.array(mod_euclidean_dist)/10, alpha = 0.5)
plt.plot(euclidean_dist, alpha = 0.5)
BMI + ESS + kor és RDI (espiratory disturbance index) kapcsolata
import numpy as np
from sklearn.linear_model import LinearRegression
X = np.nan_to_num(shhs.loc[:,["sitrd02","watv02","sitpub02","pgrcar02","lydwn02","sittlk02","sitlch02","incar02","attabl02","drive02", "age_s1", "bmi_s1"]].values)
# y = 1 * x_0 + 2 * x_1 + 3
y = np.nan_to_num(shhs.loc[:,['rdi0p_x']].values)
reg = LinearRegression().fit(X, y)
print(reg.score(X, y))
print(reg.coef_)
dict(zip(list(reg.coef_.flatten()), ["sitrd02","watv02","sitpub02","pgrcar02","lydwn02","sittlk02","sitlch02","incar02","attabl02","drive02", "age_s1", "bmi_s1"]))
X = np.nan_to_num(shhs.loc[:,["sh319a","sh319b","sh319c","sh319d","sh319e","sh319f","sh319g","sh319h","sh319i","sh319j", "age_s2", "bmi_s2"]].values)
# y = 1 * x_0 + 2 * x_1 + 3
y = np.nan_to_num(shhs.loc[:,['rdi0p_y']].values)
reg = LinearRegression().fit(X, y)
print(reg.score(X, y))
print(reg.coef_)
dict(zip(list(reg.coef_.flatten()), ["sitrd02","watv02","sitpub02","pgrcar02","lydwn02","sittlk02","sitlch02","incar02","attabl02","drive02", "age_s2", "bmi_s2"]))
plt.scatter(shhs.rdi0p_x, shhs.ess_s1)
plt.hist(np.nanstd(shhs_2_ess.values-shhs_1_ess.values, axis = 1))
plt.scatter(np.nanmean(shhs_2_ess.values-shhs_1_ess.values, axis = 1), np.nanstd(shhs_2_ess.values-shhs_1_ess.values, axis = 1))
plt.scatter(np.nanmean(np.random.randint(low=-3, high=3, size=(50000,)).reshape(5000,10), axis = 1), np.nanstd(np.random.randint(low=-3, high=3, size=(50000,)).reshape(5000,10), axis = 1))
plt.scatter(np.nanmean((np.random.normal(0, 1, 50000).astype(int)).reshape(5000,10), axis = 1), np.nanstd((np.random.normal(0, 1, 50000).astype(int)).reshape(5000,10), axis = 1))
no = (shhs_2_ess.values-shhs_1_ess.values)>0
csokken = (shhs_2_ess.values-shhs_1_ess.values)<0
np.sum(no, axis = 1)
szampar = pd.DataFrame({'no':np.sum(no, axis = 1), 'csokken':np.sum(csokken, axis = 1)})
fig = px.density_heatmap(szampar, x = "no", y = "csokken")
fig.show()
no = (shhs_2_ess.values-shhs_interim_ess.values)>0
csokken = (shhs_2_ess.values-shhs_interim_ess.values)<0
np.sum(no, axis = 1)
szampar = pd.DataFrame({'no':np.sum(no, axis = 1), 'csokken':np.sum(csokken, axis = 1)})
fig = px.density_heatmap(szampar, x = "no", y = "csokken")
fig.show()
import plotly.graph_objects as go
import ipywidgets as widgets
import pandas as pd
import numpy as np
plot = shhs.copy()
#shhs.ess_s2, shhs.ess_interim
plot["ess_s1_cat"] = pd.cut((plot['ess_s1']),bins=[0,5,10,12,15,24],
labels=["Low normal", "Higher normal", "Mild","Moderate","Severe"])
plot["ess_interim_cat"] = pd.cut((plot['ess_interim']), bins=[0,5,10,12,15,24],
labels=["Low normal", "Higher normal", "Mild","Moderate","Severe"])
plot["ess_s2_cat"] = pd.cut((plot['ess_s2']),bins=[0,5,10,12,15,24],
labels=["Low normal", "Higher normal", "Mild","Moderate","Severe"])
#plot = plot.dropna(subset=['NECK20_cat'])
#plot = plot.dropna(subset=['Dias_cat'])
# Build parcats dimensions
categorical_dimensions = ['ess_s1_cat', 'ess_interim_cat', 'ess_s2_cat'] #['Elhizas', 'event', 'ahi']
dimensions = [dict(values=plot[label], label=label) for label in categorical_dimensions]
# Build colorscale
color = np.zeros(len(plot), dtype='uint8')
colorscale = [[0, 'gray'], [0.2, 'gray'],
[0.2, 'firebrick'], [0.4, 'firebrick'],
[0.4, 'Blue'], [0.6, 'Blue'],
[0.6, 'Green'], [0.8, 'Green'],
[0.8, 'Yellow'], [1.0, 'Yellow']
]
cmin = -0.5
cmax = 4.5
# Build figure as FigureWidget
fig = go.FigureWidget(
data=[go.Scatter(x=plot.ess_s1, y=plot.ess_s2,
marker={'color': color, 'cmin': cmin, 'cmax': cmax,'opacity': 0.5,
'colorscale': colorscale, 'showscale': True,
'colorbar': {'tickvals': [0, 1, 2, 3, 4], 'ticktext': ['None', 'Red', 'Blue', 'Green', 'Yellow']}},
mode='markers'),
go.Parcats(domain={'y': [0, 0.4]}, dimensions=dimensions,
line={'colorscale': colorscale, 'cmin': cmin,
'cmax': cmax, 'color': color, 'shape': 'hspline'})#,
#go.Parcats(domain={'y': [0.2, 0.4]}, dimensions=dimensions,
# line={'colorscale': colorscale, 'cmin': cmin,
# 'cmax': cmax, 'color': color, 'shape': 'hspline'})
]
)
fig.update_layout(height=600, xaxis={'title': 'Umap 1'},
yaxis={'title': 'Umap 2', 'domain': [0.6, 1]},
dragmode='lasso', hovermode='closest')
#fig.update_traces(showscale=False)
# Build color selection widget
color_toggle = widgets.ToggleButtons(
options=['None', 'Red', 'Blue', 'Green', 'Yellow'],
index=1, description='Brush Color:', disabled=False)
# Update color callback
def update_color(trace, points, state):
# Compute new color array
new_color = np.array(fig.data[0].marker.color)
new_color[points.point_inds] = color_toggle.index
with fig.batch_update():
# Update scatter color
fig.data[0].marker.color = new_color
# Update parcats colors
fig.data[1].line.color = new_color
# Register callback on scatter selection...
fig.data[0].on_selection(update_color)
# and parcats click
fig.data[1].on_click(update_color)
# Display figure
widgets.VBox([color_toggle, fig])
import plotly.graph_objects as go
import ipywidgets as widgets
import pandas as pd
import numpy as np
plot = shhs.copy()
#shhs.ess_s2, shhs.ess_interim
plot["ess_s1_cat"] = pd.cut((plot['ess_s1']),bins=[0,8,10,14,24],
labels=["Nincs", "enyhe", "közepes","magas"])
plot["ess_interim_cat"] = pd.cut((plot['ess_interim']), bins=[0,8,10,14,24],
labels=["Nincs", "enyhe", "közepes","magas"])
plot["ess_s2_cat"] = pd.cut((plot['ess_s2']),bins=[0,8,10,14,24],
labels=["Nincs", "enyhe", "közepes","magas"])
#plot = plot.dropna(subset=['NECK20_cat'])
#plot = plot.dropna(subset=['Dias_cat'])
# Build parcats dimensions
categorical_dimensions = ['ess_s1_cat', 'ess_interim_cat', 'ess_s2_cat'] #['Elhizas', 'event', 'ahi']
dimensions = [dict(values=plot[label], label=label) for label in categorical_dimensions]
# Build colorscale
color = np.zeros(len(plot), dtype='uint8')
colorscale = [[0, 'gray'], [0.2, 'gray'],
[0.2, 'firebrick'], [0.4, 'firebrick'],
[0.4, 'Blue'], [0.6, 'Blue'],
[0.6, 'Green'], [0.8, 'Green'],
[0.8, 'Yellow'], [1.0, 'Yellow']
]
cmin = -0.5
cmax = 4.5
# Build figure as FigureWidget
fig = go.FigureWidget(
data=[go.Scatter(x=plot.ess_s1, y=plot.ess_s2,
marker={'color': color, 'cmin': cmin, 'cmax': cmax,'opacity': 0.5,
'colorscale': colorscale, 'showscale': True,
'colorbar': {'tickvals': [0, 1, 2, 3, 4], 'ticktext': ['None', 'Red', 'Blue', 'Green', 'Yellow']}},
mode='markers'),
go.Parcats(domain={'y': [0, 0.4]}, dimensions=dimensions,
line={'colorscale': colorscale, 'cmin': cmin,
'cmax': cmax, 'color': color, 'shape': 'hspline'})#,
#go.Parcats(domain={'y': [0.2, 0.4]}, dimensions=dimensions,
# line={'colorscale': colorscale, 'cmin': cmin,
# 'cmax': cmax, 'color': color, 'shape': 'hspline'})
]
)
fig.update_layout(height=600, xaxis={'title': 'Umap 1'},
yaxis={'title': 'Umap 2', 'domain': [0.6, 1]},
dragmode='lasso', hovermode='closest')
#fig.update_traces(showscale=False)
# Build color selection widget
color_toggle = widgets.ToggleButtons(
options=['None', 'Red', 'Blue', 'Green', 'Yellow'],
index=1, description='Brush Color:', disabled=False)
# Update color callback
def update_color(trace, points, state):
# Compute new color array
new_color = np.array(fig.data[0].marker.color)
new_color[points.point_inds] = color_toggle.index
with fig.batch_update():
# Update scatter color
fig.data[0].marker.color = new_color
# Update parcats colors
fig.data[1].line.color = new_color
# Register callback on scatter selection...
fig.data[0].on_selection(update_color)
# and parcats click
fig.data[1].on_click(update_color)
# Display figure
widgets.VBox([color_toggle, fig])
shhs["ess_s1_cat"] = pd.cut((shhs['ess_s1']),bins=[0,10,14,24],
labels=["0", "1", "2"])
shhs["ess_interim_cat"] = pd.cut((shhs['ess_interim']), bins=[0,10,14,24],
labels=["0", "1", "2"])
shhs["ess_s2_cat"] = pd.cut((shhs['ess_s2']),bins=[0,10,14,24],
labels=["0", "1", "2"])
plt.scatter(x = np.nanmean(shhs_2_ess.values-shhs_1_ess.values, axis = 1),
y = np.nanstd(shhs_2_ess.values-shhs_1_ess.values, axis = 1),
c = np.nan_to_num(shhs["ess_s2_cat"].fillna('0').astype(int)-shhs["ess_s1_cat"].fillna('0').astype(int)))
shhs.ess_s2_cat.fillna('0').astype(int)